# -*- coding: utf-8 -*-
import scrapy,redis
# from ..settings import DEFAULT_REQUEST_HEADERS
from ..items import GovernmentprojectItem
class GovernmentSpider(scrapy.Spider):
    """Spider that scrapes detail pages whose URLs are queued in Redis.

    URLs are popped from the Redis list ``link_url`` and every fetched page
    is turned into one ``GovernmentprojectItem`` by :meth:`parse_detail`.
    """

    name = 'government'
    allowed_domains = ['sousuo.gov.cn']
    # start_urls = ['http://sousuo.gov.cn/column/31421/0.htm']
    # NOTE(review): host and password are empty strings here -- presumably
    # placeholders that must be filled in per deployment; confirm.
    r = redis.StrictRedis(host='', port=6379, password='', db=1, decode_responses=True)

    # Characters deleted from the article body: CR, LF, TAB and the
    # non-breaking space.
    _CONTENT_NOISE = str.maketrans('', '', '\r\n\t\xa0')

    def start_requests(self):
        """Yield one request per URL popped from the Redis list ``link_url``.

        Iteration stops as soon as ``lpop`` returns a falsy value (the list
        is exhausted). ``dont_filter=True`` bypasses Scrapy's duplicate
        filter for these requests.

        NOTE(review): ``lpop`` removes the URL from the list, so a URL is
        consumed even when its request later fails.
        """
        while True:
            url = self.r.lpop('link_url')
            if not url:
                break
            yield scrapy.Request(url, callback=self.parse_detail, dont_filter=True)

    def parse_detail(self, response):
        """Extract title, time, source, content and editor from a page.

        Every field falls back to an empty string when its XPath matches
        nothing.

        :param response: the downloaded detail page.
        :return: generator yielding a single ``GovernmentprojectItem``.
        """
        item = GovernmentprojectItem()

        # Title
        title = response.xpath('//h1/text()').extract_first()
        item['government_title'] = title.strip() if title else ''

        # Publication time
        published = response.xpath('//div[@class="pages-date"]/text()').extract_first()
        item['government_time'] = published.strip() if published else ''

        # Source: keep only the text after the last full-width colon.
        source = response.xpath('//div[@class="pages-date"]/span/text()').extract_first()
        item['government_source'] = source.split('：')[-1].strip() if source else ''

        # Content: concatenate the text nodes directly. Going through
        # str(list) leaked list-repr artifacts (", " separators, a trailing
        # "]", escaped characters) into the stored text and removed genuine
        # apostrophes.
        fragments = response.xpath('//div[@class="pages_content"]/p//text()').extract()
        item['government_content'] = ''.join(fragments).translate(self._CONTENT_NOISE).strip()

        # Editor: keep only the text after the last full-width colon.
        editor = response.xpath('//div[@class="editor"]/text()').extract_first()
        item['government_compile'] = editor.split('：')[-1].strip() if editor else ''

        yield item
